%%capture
!pip install python-google-places
!pip install langdetect
!pip install bnlp_toolkit
!wget https://www.omicronlab.com/download/fonts/kalpurush.ttf
!wget https://www.omicronlab.com/download/fonts/Siyamrupali.ttf
!pip install folium
!pip install geopandas
from googleplaces import GooglePlaces, types, lang
import time
import pandas as pd
from IPython.display import Markdown, display
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import re
def printmd(string):
    """Render *string* as Markdown in the notebook output area."""
    display(Markdown(string))
from langdetect import detect
import unicodedata
import html
import folium
# Import folium MarkerCluster plugin
from folium.plugins import MarkerCluster
# Import folium MousePosition plugin
from folium.plugins import MousePosition
# Import folium DivIcon plugin
from folium.features import DivIcon
This dataset contains the list of Upazilas/Thanas for the different districts of Bangladesh.
Credit: "Mobile network coverage in Bangladeshi Upazila or Thana" — Kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d mushfiqurrobin/network-coverage
!mkdir network-coverage
!unzip network-coverage.zip -d network-coverage
# Load the network-coverage dataset and keep only the two location columns.
df = pd.read_csv("/content/network-coverage/Coverage.csv")
df_area = df[['Upazila_or_Thana', 'District']]

# Missing-value summary: absolute count and percentage per column.
null_total = df_area.isnull().sum().sort_values(ascending=False)
null_share = (df_area.isnull().sum() / df_area.isnull().count()).sort_values(ascending=False)
missing_summary = pd.concat([null_total, null_share * 100], axis=1, keys=['Total', 'Percent'])
display(missing_summary.head(5))

# Count duplicate rows (the value is rendered by the notebook).
df_area.duplicated().sum()

# Drop duplicates, re-number the index, and persist the clean location list.
df_area = df_area.drop_duplicates(keep="first").reset_index(drop=True)
df_area.to_csv("locations.csv", index=False)
Here, I am combining each Upazila/Thana and its corresponding District into a single string and storing them in the locations list. Later I will use this list for the search queries.
I have also initialized the search radius to 2,000 meters (2 km).
Finally, I will store the restaurants' information in restaurant_data.
# Google Places client configuration.
API_KEY = "YOUR API KEY"
google_places = GooglePlaces(API_KEY)
restaurant_data = []  # accumulates one row per scraped restaurant
radius = 2000         # search radius in metres (2 km)

# Build "Upazila/Thana, District" query strings from the cleaned dataframe.
locations = []
for row in df_area.values.tolist():
    locations.append(', '.join(str(field) for field in row))
print(locations)
def _record_places(query_result, out):
    """Fetch full details for each place in *query_result* and append one
    [place_id, name, latitude, longitude, rating, number_of_reviews,
    affluence, address] row per restaurant to *out*.

    Extracted to remove the copy-pasted extraction code that previously
    appeared once for the first page and again inside the pagination loop.
    """
    for place in query_result.places:
        place.get_details()  # populates place.details with the full record
        out.append([
            place.details.get('place_id'),
            place.name,
            place.geo_location.get('lat'),
            place.geo_location.get('lng'),
            place.rating,
            place.details.get('user_ratings_total'),
            place.details.get('price_level'),
            place.formatted_address,
        ])


for location in locations:
    print("---------------------", location, "-----------------------")
    query_result = google_places.nearby_search(
        location=location, keyword='Restaurant',
        radius=radius)
    if query_result:
        _record_places(query_result, restaurant_data)
        print("--------------------- Scrapped Restaurants: ", len(restaurant_data))
        time.sleep(5)  # throttle to respect the Places API rate limit
        # Walk through any remaining result pages for this location.
        while query_result.has_next_page_token:
            query_result = google_places.nearby_search(
                location=location, keyword='Restaurant',
                radius=radius, pagetoken=query_result.next_page_token)
            _record_places(query_result, restaurant_data)
            print("--------------------- Scrapped Restaurants: ", len(restaurant_data))
            time.sleep(5)
    time.sleep(5)
# Persist the scraped rows, then reload from disk so later cells work from
# the saved CSV (dtypes come from the file, not the in-memory lists).
columns = ['place_id', 'name', 'latitude', 'longitude', 'rating',
           'number_of_reviews', 'affluence', 'address']
df_restaurant = pd.DataFrame(restaurant_data, columns=columns)
df_restaurant.to_csv("restaurants.csv", index=False, encoding='utf-8')

restaurant_df = pd.read_csv("/content/restaurants.csv", encoding='utf-8')
display(restaurant_df.duplicated().sum())
There are 1945 duplicate rows present in the dataframe.
restaurant_df.drop_duplicates(keep="first", inplace=True)
Here, I kept the address of each restaurant to check whether it is in Bangladesh or not. As we can see below, 62 restaurants are in India.
# Flag rows whose address mentions "Bangladesh". Comparing against
# True/False means rows with a missing address (NaN mask) fall out of
# both subsets, matching the original behaviour.
in_bd = restaurant_df['address'].str.contains('Bangladesh')
res_not_bangladesh = restaurant_df[in_bd == False]
res_not_bangladesh
restaurant_df = restaurant_df[in_bd == True]
restaurant_df.reset_index(drop=True, inplace=True)
Now the dataframe restaurant_df contains only Bangladeshi restaurants.
def missing_value_describe(data):
    """Print and visualise a missing-value summary for *data*.

    Reports how many rows and columns are affected; when anything is
    missing, displays per-column totals/percentages and draws a bar plot
    of the absolute missing counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    """
    # Per-column missing counts (descending) and percentages.
    total = data.isna().sum().sort_values(ascending=False)
    missing_value_pct_stats = (data.isna().sum() / len(data) * 100)
    missing_value_col_count = sum(missing_value_pct_stats > 0)
    # concat aligns on the column index, so rows follow `total`'s sorted order.
    missing_data = pd.concat([total, missing_value_pct_stats], axis=1,
                             keys=['Total', 'Percent'])
    print("Number of rows with at least 1 missing values:",
          data.isna().any(axis=1).sum())
    print("Number of columns with missing values:", missing_value_col_count)
    if missing_value_col_count != 0:
        # Show only the columns that actually have missing values.
        print("\nMissing percentage (descending):")  # fixed typo: "desceding"
        display(missing_data[:missing_value_col_count])
        # Bar plot of absolute missing counts, smallest first.
        missing = data.isna().sum()
        missing = missing[missing > 0]
        missing.sort_values(inplace=True)
        missing.plot.bar()
    else:
        print("No missing data!!!")
# pass a dataframe to the function
# Reports missing values for the cleaned restaurant dataframe.
missing_value_describe(restaurant_df)
Converting the Affluence Level 1.0, 2.0, 3.0... to $, $$, $$$...
# Map numeric Google price levels to dollar-sign labels ($ ... $$$$).
price_labels = {1.0: '$', 2.0: '$$', 3.0: '$$$', 4.0: '$$$$'}
restaurant_df['affluence'] = restaurant_df['affluence'].replace(price_labels)
restaurant_df[restaurant_df['affluence'].notna()]
Saving the final dataframe into CSV
# Keep only the analysis columns and save the final dataset.
keep_cols = ['name', 'latitude', 'longitude', 'rating', 'number_of_reviews', 'affluence']
final_df = restaurant_df[keep_cols]
display(final_df)
final_df.to_csv("bangladesh_restaurants.csv", index=False, encoding='utf-8')

# Reload from disk so downstream cells work from the saved file.
bd_restaurant = pd.read_csv("/content/bangladesh_restaurants.csv", encoding='utf-8')
display(bd_restaurant)
bd_restaurant.describe()
Looks like some of the names are in Bangla. Let's separate the restaurants that have their names in Bangla.
# A name is tagged "English" when it starts with an ASCII letter;
# otherwise it is assumed to be Bangla.
latin_start = re.compile(r'[a-zA-Z]')
bd_restaurant["name_type"] = bd_restaurant["name"].apply(
    lambda name: "English" if latin_start.match(name) else "Bangla")
en_bd_restaurant = bd_restaurant[bd_restaurant['name_type'] == "English"]
non_en_bd_restaurant = bd_restaurant[bd_restaurant['name_type'] == "Bangla"]
printmd("### Restaurants With English Name")
display(en_bd_restaurant)
printmd("### Restaurants With Bangla Name")
display(non_en_bd_restaurant)
# Word cloud of English restaurant names, weighted by frequency.
name_freq = en_bd_restaurant.name.value_counts().to_dict()
wc = WordCloud(width=800, height=400, background_color="white",
               max_font_size=300).generate_from_frequencies(name_freq)
plt.figure(figsize=(14, 10))
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()
result = wc.to_file("English_word_cloud.png")
printmd("### These are the Most Frequently Used Restaurant Names in English")
from bnlp.corpus import stopwords, punctuations

# Word cloud of Bangla restaurant names; a Bangla-capable font and a
# Bengali Unicode-range regexp are needed to render the glyphs correctly.
regex = r"[\u0980-\u09FF]+"
bangla_freq = non_en_bd_restaurant.name.value_counts().to_dict()
wc = WordCloud(width=800, height=400, background_color="white",
               max_font_size=300, font_path="/content/Siyamrupali.ttf",
               regexp=regex).generate_from_frequencies(bangla_freq)
plt.figure(figsize=(14, 10))
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()
result = wc.to_file("Bangla_word_cloud.png")
printmd("### These are the Most Frequently Used Restaurant Names in Bangla")
import geopandas
import folium
from folium.plugins import MarkerCluster, HeatMap

# Build a GeoDataFrame of restaurant coordinates.
geometry = geopandas.points_from_xy(bd_restaurant.longitude, bd_restaurant.latitude)
geo_df = geopandas.GeoDataFrame(bd_restaurant[['longitude', 'latitude']], geometry=geometry)
geo_df.head()

# Density heatmap of all restaurants, centred on Bangladesh.
bd_coordinate = [23.6850, 90.3563]
site_map = folium.Map(location=bd_coordinate, tiles='Cartodb dark_matter', zoom_start=8)
heat_data = [[pt.xy[1][0], pt.xy[0][0]] for pt in geo_df.geometry]
HeatMap(heat_data).add_to(site_map)
site_map
Plotting only those restaurants that have price levels:
$ means Cheap, $$ means Moderate, $$$ means Expensive, $$$$ means Very Expensive.
bd_coordinate = [23.6850, 90.3563]
site_map = folium.Map(location=bd_coordinate, zoom_start=7)
# Only restaurants with a known price level get a marker.
data = bd_restaurant[bd_restaurant['affluence'].notna()]
for _, row in data.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=row['name'],
        tooltip=str(row['name']) + ',' + str(row['affluence']),
    ).add_to(site_map)
site_map
# Bubble map: circle radius scales with each restaurant's review count.
bd_coordinate = [23.6850, 90.3563]
circle_map = folium.Map(location=bd_coordinate, zoom_start=8, prefer_canvas=True)

# Work on an explicit copy: mutating a filtered view with
# `fillna(..., inplace=True)` triggers pandas' SettingWithCopyWarning and
# can silently fail to update the data.
data = bd_restaurant[bd_restaurant['affluence'].notna()].copy()
data['number_of_reviews'] = data['number_of_reviews'].fillna(0).astype(int)

occurences = folium.map.FeatureGroup()
n_mean = data['number_of_reviews'].mean()  # scale factor for the radii
for lat, lng, number, name in zip(data['latitude'],
                                  data['longitude'],
                                  data['number_of_reviews'], data['name']):
    occurences.add_child(
        folium.vector_layers.CircleMarker(
            [lat, lng],
            radius=number / (n_mean / 3),  # radius proportional to review count
            color='yellow',
            fill=True,
            fill_color='blue',
            fill_opacity=0.4,
            tooltip=str(number) + ',' + str(name),
            # get more from tooltip https://github.com/python-visualization/folium/issues/1010#issuecomment-435968337
        )
    )
circle_map.add_child(occurences)
# Map of "$$$" (expensive) restaurants, tooltips show name and rating.
data = bd_restaurant[bd_restaurant['affluence'].notna()]
data_expensive = data[data['affluence'] == "$$$"]
bd_coordinate = [23.6850, 90.3563]
expensive_map = folium.Map(location=bd_coordinate, zoom_start=10, prefer_canvas=True)
for _, row in data_expensive.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        tooltip=str(row['name']) + ',' + str(row['rating']),
    ).add_to(expensive_map)
expensive_map
# Map of "$$$$" (very expensive) restaurants, tooltips show name and rating.
data = bd_restaurant[bd_restaurant['affluence'].notna()]
data_very_expensive = data[data['affluence'] == "$$$$"]
bd_coordinate = [23.6850, 90.3563]
very_expensive_map = folium.Map(location=bd_coordinate, zoom_start=10, prefer_canvas=True)
for _, row in data_very_expensive.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        tooltip=str(row['name']) + ',' + str(row['rating']),
    ).add_to(very_expensive_map)
very_expensive_map
The dataset may contain some anomalies such as Tea Stores or Food Stores that are also registered under Restaurant keyword. More extensive cleaning can be done to handle such issues in the future.